This document is designed to cover some of the high points of sparse clustering and its implementation in R using the sparcl.
A Note: Sparcl package was removed from CRAN on 7/20/2018 “as check problems were not corrected despite reminders.”(See page at https://cran.r-project.org/web/packages/sparcl/index.html.) However, there is another version out there by Kondo, Salibian-Bareera and Zamar (2016) called RSKC.
#installs and libraries the RSKC package
#install.packages('RSKC')
suppressMessages(library(RSKC))
#installs and libraries the Sparcl package (possibly deprecated)
#install the most recent version then use R-studio to install from archived file
#install.packages("C:/Users/r74t532/Downloads/sparcl_1.0.3.tar.gz", repos = NULL, type = "source")
#devtools::install_version('sparcl',version = '1.0.3')
#library(sparcl) #works on my laptop but not OPA computernba <- read.csv('data/na.csv', header = TRUE)
#pca-based
pc1 <- prcomp(nba[,-c(1,2,3)], scale = TRUE, center = TRUE)
summary(pc1)## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 3.0615 1.9545 1.27386 1.21837 0.98520 0.90297
## Proportion of Variance 0.4687 0.1910 0.08114 0.07422 0.04853 0.04077
## Cumulative Proportion 0.4687 0.6596 0.74079 0.81501 0.86354 0.90431
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 0.73939 0.64388 0.57694 0.44294 0.43444 0.33548
## Proportion of Variance 0.02734 0.02073 0.01664 0.00981 0.00944 0.00563
## Cumulative Proportion 0.93164 0.95237 0.96902 0.97883 0.98826 0.99389
## PC13 PC14 PC15 PC16 PC17 PC18
## Standard deviation 0.26188 0.17055 0.11187 0.07452 0.05996 0.04939
## Proportion of Variance 0.00343 0.00145 0.00063 0.00028 0.00018 0.00012
## Cumulative Proportion 0.99732 0.99877 0.99940 0.99968 0.99986 0.99998
## PC19 PC20
## Standard deviation 0.01562 0.01345
## Proportion of Variance 0.00001 0.00001
## Cumulative Proportion 0.99999 1.00000
#do some clustering
library(mclust)## Package 'mclust' version 5.4.1
## Type 'citation("mclust")' for citing this R package in publications.
modclust <- mclustBIC(pc1$x[,1:2])
mc <- Mclust(pc1$x[,1:2], x = modclust)
#build a dataframe
dat1 <- tibble(pc1$x[,1],pc1$x[,2],mc$classification); names(dat1) <- c('s1','s2','class')
#build a plot
p <- ggplot(dat1) + geom_point(aes(s1,s2,color = factor(class))) + ggtitle("NBA Rookies 2017") + theme_classic()
ggplotly(p)##################
##Compare the Sparse Clustering Methods to this:
library(RSKC)
spk <- RSKC(nba[,-c(1,2,3)], ncl = 3, alpha = 0, L1 = 1) # Sparse K-Means
#see documentation but alpha = 0 and l1 = 1 gives sparse K means
#to get "robust" sparse k-means, we need alpha >0 and L1 = 1
rspk <- RSKC(nba[,-c(1,2,3)], ncl = 3, alpha = .5, L1 = 1) # Robust Sparse K-Means
dat1$spk <- spk$labels
dat1$rspk <- rspk$labels
#gives the sparse k-means
p2 <- ggplot(dat1) + geom_point(aes(s1,s2,color = factor(spk))) + ggtitle("NBA Rookies 2017") + theme_classic()
ggplotly(p2)#gives the robust sparse k-means
p3 <- ggplot(dat1) + geom_point(aes(s1,s2,color = factor(rspk))) + ggtitle("NBA Rookies 2017") + theme_classic()
ggplotly(p3)wine <- read.csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header = FALSE)
names(wine) <- c("Class","Alcohol",'Malic Acid','Ash','Alcalinity of ash','Magnesium','Total phenols',
'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins','Color Intensity',
'Hue','0d280/od315 of diluted wines','Proline')
head(wine)## Class Alcohol Malic Acid Ash Alcalinity of ash Magnesium Total phenols
## 1 1 14.23 1.71 2.43 15.6 127 2.80
## 2 1 13.20 1.78 2.14 11.2 100 2.65
## 3 1 13.16 2.36 2.67 18.6 101 2.80
## 4 1 14.37 1.95 2.50 16.8 113 3.85
## 5 1 13.24 2.59 2.87 21.0 118 2.80
## 6 1 14.20 1.76 2.45 15.2 112 3.27
## Flavanoids Nonflavanoid phenols Proanthocyanins Color Intensity Hue
## 1 3.06 0.28 2.29 5.64 1.04
## 2 2.76 0.26 1.28 4.38 1.05
## 3 3.24 0.30 2.81 5.68 1.03
## 4 3.49 0.24 2.18 7.80 0.86
## 5 2.69 0.39 1.82 4.32 1.04
## 6 3.39 0.34 1.97 6.75 1.05
## 0d280/od315 of diluted wines Proline
## 1 3.92 1065
## 2 3.40 1050
## 3 3.17 1185
## 4 3.45 1480
## 5 2.93 735
## 6 2.85 1450
#typical kmeans
km.wine <- kmeans(wine, centers = 3)
#sparse kmeans
sparse.wine <- RSKC(wine, ncl = 3, alpha = 0, L1 = 1)
#robust kmeans
robust.wine <- RSKC(wine, ncl = 3, alpha = 0.5, L1 = 1)
##compare cluster membership
clusters <- tibble(wine$Class,km.wine$cluster,sparse.wine$labels, robust.wine$labels)
datatable(clusters, options = list(pageLength = 5, scrollX = 75))In this case, we have \(n = 335 institutions\) with 8 different characteristics.